# Python source code of echoDoc0.1

import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from mpl_toolkits.mplot3d import Axes3D
from logging import basicConfig, INFO
from random import shuffle
from sklearn import svm
from sklearn.externals import joblib
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA



class LabeledLineSentence(object):
    """Stream a comma-separated corpus file as shuffled TaggedDocuments.

    Each line is split on commas: the first field is used as the subreddit
    tag and the second field is whitespace-tokenised into the word list.
    NOTE: any text after a second comma on a line is ignored.

    Lines are buffered and shuffled in batches of at most ``batch_size``
    lines before being yielded, so shuffling is local to each batch.

    Every document carries two tags: the subreddit and
    ``clusterLabel[subreddit]``. ``clusterLabel`` is a module-level mapping
    that must be defined before the corpus is iterated.

    Args:
        filename: Path of the UTF-8 encoded corpus file.
        batch_size: Maximum number of lines buffered per shuffled batch.
    """

    def __init__(self, filename, batch_size=10000000):
        self.filename = filename
        self.batch_size = batch_size

    def __iter__(self):
        deck = []
        # The context manager closes the file even if iteration is abandoned.
        with open(self.filename, encoding="utf-8") as corpus:
            for line in corpus:
                deck.append(line)
                if len(deck) >= self.batch_size:
                    yield from self._deal(deck)
                    deck = []
        # Flush the final partial batch; without this, a corpus shorter than
        # batch_size (or the tail of a longer one) would never be yielded.
        if deck:
            yield from self._deal(deck)

    @staticmethod
    def _deal(deck):
        """Shuffle one batch of raw lines in place and yield a document per line."""
        shuffle(deck)
        for card in deck:
            fields = card.split(",")
            subreddit = fields[0]
            body = fields[1].split()
            yield TaggedDocument(words=body, tags=[subreddit, clusterLabel[subreddit]])


def trainNewModel(inputFile, outputFile, model):
    """Train ``model`` from scratch on a corpus file and save the result.

    Args:
        inputFile: Corpus file name, looked up inside the ``Data`` folder.
        outputFile: File name the trained model is saved under, inside the
            ``Models`` folder.
        model: An untrained model object; its vocabulary is built from the
            corpus before training.
    """
    corpus = LabeledLineSentence("Data\\" + inputFile)
    model.build_vocab(corpus)
    # corpus_count is read only after the vocabulary pass has run.
    exampleCount = model.corpus_count
    model.train(corpus, total_examples=exampleCount)
    savePath = "Models\\" + outputFile
    model.save(savePath)


def retrainModel(vectorFile, dataFile, outputFile, iterations):
    """Load a saved model, run further training passes and save it again.

    Args:
        vectorFile: Name of the saved model inside the ``Models`` folder.
        dataFile: Corpus file name inside the ``Data`` folder.
        outputFile: File name for the retrained model, inside ``Models``.
        iterations: Number of times ``train`` is called on the corpus.
    """
    corpus = LabeledLineSentence("Data\\" + dataFile)
    doc2vec = Doc2Vec.load("Models\\" + vectorFile)
    for _ in range(iterations):
        doc2vec.train(corpus)
    doc2vec.save("Models\\" + outputFile)


def testModel(inputFile):
    """Run an interactive console menu for inspecting a saved model.

    Option 1 prints the documents most similar to a subreddit tag, option 2
    prints the words most similar to a word, and option 3 infers a vector
    for every tab-separated "category<TAB>text" line of ``testing.txt`` and
    appends the nearest documents to ``clusteredResults.txt``. Any other
    input ends the loop.

    Args:
        inputFile: Name of the saved model inside the ``Models`` folder.
    """
    doc2vec = Doc2Vec.load("Models\\" + inputFile)
    menu = ("Press 1 to compare documents within the model to each other.\n"
            "Press 2 to run similarity tests on individual words.\n"
            "Press 3 to get the top related subreddits for an inferred new vector (comment).\n"
            "Hit any key to exit.\n")
    while True:
        selection = input(menu)
        if selection not in ("1", "2", "3"):
            break

        if selection == "1":
            subreddit = input("Enter the subreddit you want to test.\n")
            print(doc2vec.docvecs.most_similar(subreddit))
            continue

        if selection == "2":
            word = input("Enter the word you wish to analyze.\n").lower()
            print(doc2vec.most_similar(word))
            continue

        # Option 3: read every test line first, then append all results.
        with open("testing.txt") as testFile:
            lines = testFile.readlines()
        report = []
        for line in lines:
            fields = line.split("\t")
            category = fields[0]
            text = fields[1]
            inferred = doc2vec.infer_vector(text.split())
            neighbours = doc2vec.docvecs.most_similar(positive=[inferred])
            report.append("The original category is {}: {}\n {}\n".format(category, text, neighbours))
        with open("clusteredResults.txt", "a") as resultFile:
            resultFile.writelines(report)


def newKMeansModel(vectorFile, outputFile, numClusters):
    """Fit KMeans on a saved model's document vectors and dump the estimator.

    Approach taken from
    https://stackoverflow.com/questions/43476869/doc2vec-sentence-clustering

    Args:
        vectorFile: Name of the saved model inside the ``Models`` folder.
        outputFile: Path the fitted KMeans estimator is written to with joblib.
        numClusters: Number of clusters to fit.
    """
    doc2vec = Doc2Vec.load("Models\\" + vectorFile)
    vectors = doc2vec.docvecs.doctag_syn0
    clusterer = KMeans(n_clusters=numClusters)
    print("Starting")
    clusterer.fit(vectors)
    print("Fitting Data")
    joblib.dump(clusterer, outputFile)


def loadKMeansModel(vectorFile, clusterFile, csvFile):
    """Export the cluster assignment of every document tag to a CSV file.

    Approach taken from
    https://stackoverflow.com/questions/43476869/doc2vec-sentence-clustering

    The table is printed and written with three columns: the document tag,
    the model's ``doctags`` entry for that tag, and the cluster label. The
    cluster labels are also used as the row index.

    Args:
        vectorFile: Name of the saved model inside the ``Models`` folder.
        clusterFile: Path of the joblib dump of a fitted KMeans estimator.
        csvFile: Path of the CSV file to write.
    """
    tagColumn = 'labels'
    detailColumn = "index, wordcount and repeated words"
    clusterColumn = 'clusters'

    doc2vec = Doc2Vec.load("Models\\" + vectorFile)
    fitted = joblib.load(clusterFile)
    assignments = fitted.labels_.tolist()

    tags = doc2vec.docvecs.offset2doctag
    details = []
    for tag in tags:
        details.append(doc2vec.docvecs.doctags[tag])

    table = pd.DataFrame(
        {tagColumn: tags, detailColumn: details, clusterColumn: assignments},
        index=[assignments],
        columns=[tagColumn, detailColumn, clusterColumn],
    )
    print(table)
    table.to_csv(csvFile)


def newDBSCANModel(vectorFile, outputFile, csvFile="DBSCAN.csv", eps=0.03):
    """Cluster a saved model's document vectors with DBSCAN and export the result.

    The fitted estimator is dumped with joblib, and a table of document tag,
    the model's ``doctags`` entry for that tag, and cluster label is printed
    and written to ``csvFile``. The estimated number of clusters (ignoring
    the noise label -1) is printed last.

    Args:
        vectorFile: Name of the saved model inside the ``Models`` folder.
        outputFile: Path the fitted DBSCAN estimator is written to.
        csvFile: Path of the CSV file to write; defaults to "DBSCAN.csv".
        eps: Neighbourhood radius passed to DBSCAN (cosine distance).
    """
    model = Doc2Vec.load("Models\\" + vectorFile)

    # One row per document vector, cast to float. Stacking the vectors
    # directly avoids assuming a fixed vector size of 300.
    doc_vecs = np.array([model.docvecs[doc] for doc in range(len(model.docvecs))],
                        dtype='float')

    db = DBSCAN(eps=eps, algorithm="brute", metric='cosine').fit(doc_vecs)
    joblib.dump(db, outputFile)

    labels = db.labels_
    # Label -1 marks noise points and is not counted as a cluster.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    clusters = labels.tolist()
    cluster_info = {'labels': model.docvecs.offset2doctag,
                    "index, wordcount and repeated words": [model.docvecs.doctags[x] for x in
                                                            model.docvecs.offset2doctag],
                    'clusters': clusters}
    sentenceDF = pd.DataFrame(cluster_info, index=[clusters],
                              columns=['labels', "index, wordcount and repeated words", 'clusters'])
    print(sentenceDF)
    sentenceDF.to_csv(csvFile)

    print('Estimated number of clusters: %d' % n_clusters_)


def plotModel2D(vectorFile, numClusters):
    """Plot KMeans clusters of the document vectors on a 2-D PCA projection.

    Based on
    http://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_digits.html

    The document vectors are reduced to two principal components, KMeans is
    fitted on that 2-D data, and the cluster regions, the data points and
    the centroids are drawn in one figure.

    Args:
        vectorFile: Name of the saved model inside the ``Models`` folder.
        numClusters: Number of KMeans clusters.
    """
    model = Doc2Vec.load("Models\\" + vectorFile)
    docVecs = model.docvecs.doctag_syn0
    # KMeans must be fitted on exactly the two plotted dimensions: predict()
    # below is called on a 2-column mesh and rejects any other feature count.
    reduced_data = PCA(n_components=2).fit_transform(docVecs)
    kmeans = KMeans(init='k-means++', n_clusters=numClusters, n_init=10)
    kmeans.fit(reduced_data)

    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .02

    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    # Obtain labels for each point in mesh. Use last trained model.
    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure(1)
    plt.clf()
    plt.imshow(Z, interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap="hot",
               aspect='auto', origin='lower')
    plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)

    # Plot the centroids as a white X
    centroids = kmeans.cluster_centers_
    plt.scatter(centroids[:, 0], centroids[:, 1],
                marker='x', s=169, linewidths=3,
                color='w', zorder=10)
    plt.title('K-means clustering on Reddit Text Data(PCA-reduced data)\n'
              'Centroids are marked with white cross')
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    plt.show()


def plotModel3D(vectorFile, numClusters):
    """Show a 3-D scatter plot of the document vectors coloured by KMeans cluster.

    Based on
    http://scikit-learn.org/stable/auto_examples/cluster/plot_cluster_iris.html

    The document vectors are reduced to 10 principal components, KMeans is
    fitted on all 10, and components 5, 2 and 3 are used as the x, y and z
    coordinates of the plot.

    Args:
        vectorFile: Name of the saved model inside the ``Models`` folder.
        numClusters: Number of KMeans clusters.
    """
    model = Doc2Vec.load("Models\\" + vectorFile)
    docVecs = model.docvecs.doctag_syn0
    reduced_data = PCA(n_components=10).fit_transform(docVecs)
    kmeans = KMeans(init='k-means++', n_clusters=numClusters, n_init=10)
    kmeans.fit(reduced_data)
    labels = kmeans.labels_

    # Draw once into a cleared figure (the plot used to be rendered, wiped
    # and rendered a second time with identical data).
    fig = plt.figure(1, figsize=(10, 10))
    plt.clf()
    ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134)
    # Recent Matplotlib releases no longer attach an Axes3D to its figure
    # automatically; adding it explicitly is harmless on older releases.
    fig.add_axes(ax)

    # The builtin float replaces the removed np.float alias.
    ax.scatter(reduced_data[:, 5], reduced_data[:, 2], reduced_data[:, 3], c=labels.astype(float))
    # xaxis/yaxis/zaxis replace the removed w_xaxis/w_yaxis/w_zaxis aliases.
    for axis in (ax.xaxis, ax.yaxis, ax.zaxis):
        axis.set_ticklabels([])
    plt.show()


def clusterLabeler(csvFile):
    """Build a dictionary from a comma-delimited file.

    For every row, the second field becomes the key and the first field the
    value; when a key occurs more than once the last row wins.

    Args:
        csvFile: Path of the CSV file to read.

    Returns:
        dict mapping each row's second field to its first field.
    """
    mapping = {}
    with open(csvFile, 'r') as handle:
        for record in csv.reader(handle, delimiter=','):
            mapping[record[1]] = record[0]
    return mapping


# Configure logging at INFO level; each record shows timestamp, level and message.
basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=INFO)

# Use to create a dictionary of subreddits and their respective clusters. If not using, remove the 'clusterLabel' tag in LabeledLineSentence
# clusterLabel = clusterLabeler("clusters.csv")

# Train a new model from scratch based on your own corpus. Depending on your CPU you may need to change the number of workers
# trainNewModel("miniFeb2017SubredditIncluded.txt", "yourOutputFile", Doc2Vec(dm=0, iter=20, dm_mean=1, size=300, window=5, negative=5, min_count=10, workers=7))

# Retrain a model for some number of iterations, using the same vocabulary as before.
# retrainModel("Doc2VecFeb2017MiniCorpus", "miniFeb2017SubredditIncluded.txt", "Doc2VecFeb2017UpdatedMiniCorpus", 3)

# Run the test suite on a model
# testModel("clusteredModel")

# Create a new KMeans model, and save it
# newKMeansModel("Doc2VecFeb2017MiniCorpus", "KMeans_cluster.pkl", 40)

# Load a KMeans model and export the resulting clusters to a csv, along with subreddit name and other information
# loadKMeansModel("Doc2VecFeb2017MiniCorpus", "KMeans_cluster.pkl", "clusters.csv")

# Use dimensionality reduction to plot KMeans clusters
# plotModel2D("Doc2VecFeb2017MiniCorpus", 40)

# Use dimensionality reduction to plot KMeans clusters in 3D
# plotModel3D("Doc2VecFeb2017MiniCorpus", 40)

# Use DBSCAN to cluster. Results were very suboptimal, with most subreddits belonging to a single massive cluster
# newDBSCANModel("Doc2VecFeb2017MiniCorpus", "yourOutput.pkl")



# Feature Forthcoming: Classification
# model = Doc2Vec.load("clusteredModel")
#
# classifier = svm.SVC
# classifier.fit(train_array, train_labels)
# print(classifier.score(test_array, test_labels))